{
 "cells": [
  {
   "cell_type": "code",
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2025-06-19T12:43:29.124429Z",
     "start_time": "2025-06-19T12:43:29.107879Z"
    }
   },
   "source": [
    "import pymysql\n",
    "import pandas as pd"
   ],
   "outputs": [],
   "execution_count": 2
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-19T12:43:29.168573Z",
     "start_time": "2025-06-19T12:43:29.155124Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 数据库连接配置\n",
    "db_config = {\n",
    "    'host': 'localhost',  # 数据库主机地址\n",
    "    'user': 'root',  # 用户名\n",
    "    'password': '123456',  # 密码\n",
    "    'database': 'pachong',  # 数据库名称\n",
    "}"
   ],
   "id": "a836f1c191415862",
   "outputs": [],
   "execution_count": 3
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-19T12:43:29.488744Z",
     "start_time": "2025-06-19T12:43:29.482423Z"
    }
   },
   "cell_type": "code",
   "source": [
    "def method1():\n",
    "    try:\n",
    "        # 建立数据库连接\n",
    "        conn = pymysql.connect(**db_config)\n",
    "        # SQL查询语句\n",
    "        sql_query = \"SELECT * FROM books\"\n",
    "        # 直接读取为DataFrame\n",
    "        df = pd.read_sql(sql_query, conn)\n",
    "        return df\n",
    "\n",
    "    except Exception as e:\n",
    "        print(f\"数据库操作失败: {e}\")\n",
    "    finally:\n",
    "        if 'conn' in locals() and conn.open:\n",
    "            conn.close()"
   ],
   "id": "f2378bc8d21e2133",
   "outputs": [],
   "execution_count": 4
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-19T12:43:29.569419Z",
     "start_time": "2025-06-19T12:43:29.495266Z"
    }
   },
   "cell_type": "code",
   "source": "df = method1()",
   "id": "56aff17a228c9321",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\86188\\AppData\\Local\\Temp\\ipykernel_23088\\1993941310.py:8: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n",
      "  df = pd.read_sql(sql_query, conn)\n"
     ]
    }
   ],
   "execution_count": 5
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-19T12:43:29.604924Z",
     "start_time": "2025-06-19T12:43:29.582740Z"
    }
   },
   "cell_type": "code",
   "source": "df.head()",
   "id": "61368f71957cea5",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   id       title      author  publisher  price                     tags  \\\n",
       "0   1       女性潜意识         阎勤民      花城出版社    NaN  心理咨询, 心理医生, 女性, 潜意识, 意识   \n",
       "1   2       怪诞行为学   [美]丹· 艾瑞里      中信出版社   30.2                      未分类   \n",
       "2   3     人的潜能和价值    [美]马斯洛等著      华夏出版社    NaN                人本主义, 马斯洛   \n",
       "3   4       语言心理学  (美)D·W·卡罗尔  华东师范大学出版社   42.3                      教科书   \n",
       "4   5  蛤蟆先生去看心理医生  [英]罗伯特·戴博德    天津人民出版社    NaN       心理咨询, 情绪, 心理医生, 治愈   \n",
       "\n",
       "                                 url  fav_count  commend_count  \\\n",
       "0  http://www.ixinqing.com/book/1596        163              2   \n",
       "1    http://www.ixinqing.com/book/28        248              3   \n",
       "2  http://www.ixinqing.com/book/7466        130              0   \n",
       "3   http://www.ixinqing.com/book/165        622              0   \n",
       "4  http://www.ixinqing.com/book/7662       1167              0   \n",
       "\n",
       "                                             summary  \n",
       "0  这是我国第一本以记录和分析心理门诊、心理咨询实例的形式，集中论述女性心理、开解女性性别密码的...  \n",
       "1  《怪诞行为学》中讲述的是我们常常暗下决心节食减肥，但是只要看到甜点小推车一过来，我们的决心就...  \n",
       "2  《人的潜能和价值》是人本主义心理学有关“人的潜能和价值”讨论的论文集。主要选录了马斯洛有关人...  \n",
       "3  心理语言学研究对认知科学、对心理学和语言学的理论建设具有重要作用。在语言教学、言语缺陷的诊断...  \n",
       "4  蛤蟆先生一向爱笑爱闹，如今却一反常态地郁郁寡欢，他一个人躲在屋里，连起床梳洗的力气都没有。朋...  "
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>title</th>\n",
       "      <th>author</th>\n",
       "      <th>publisher</th>\n",
       "      <th>price</th>\n",
       "      <th>tags</th>\n",
       "      <th>url</th>\n",
       "      <th>fav_count</th>\n",
       "      <th>commend_count</th>\n",
       "      <th>summary</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>女性潜意识</td>\n",
       "      <td>阎勤民</td>\n",
       "      <td>花城出版社</td>\n",
       "      <td>NaN</td>\n",
       "      <td>心理咨询, 心理医生, 女性, 潜意识, 意识</td>\n",
       "      <td>http://www.ixinqing.com/book/1596</td>\n",
       "      <td>163</td>\n",
       "      <td>2</td>\n",
       "      <td>这是我国第一本以记录和分析心理门诊、心理咨询实例的形式，集中论述女性心理、开解女性性别密码的...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>怪诞行为学</td>\n",
       "      <td>[美]丹· 艾瑞里</td>\n",
       "      <td>中信出版社</td>\n",
       "      <td>30.2</td>\n",
       "      <td>未分类</td>\n",
       "      <td>http://www.ixinqing.com/book/28</td>\n",
       "      <td>248</td>\n",
       "      <td>3</td>\n",
       "      <td>《怪诞行为学》中讲述的是我们常常暗下决心节食减肥，但是只要看到甜点小推车一过来，我们的决心就...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>人的潜能和价值</td>\n",
       "      <td>[美]马斯洛等著</td>\n",
       "      <td>华夏出版社</td>\n",
       "      <td>NaN</td>\n",
       "      <td>人本主义, 马斯洛</td>\n",
       "      <td>http://www.ixinqing.com/book/7466</td>\n",
       "      <td>130</td>\n",
       "      <td>0</td>\n",
       "      <td>《人的潜能和价值》是人本主义心理学有关“人的潜能和价值”讨论的论文集。主要选录了马斯洛有关人...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>语言心理学</td>\n",
       "      <td>(美)D·W·卡罗尔</td>\n",
       "      <td>华东师范大学出版社</td>\n",
       "      <td>42.3</td>\n",
       "      <td>教科书</td>\n",
       "      <td>http://www.ixinqing.com/book/165</td>\n",
       "      <td>622</td>\n",
       "      <td>0</td>\n",
       "      <td>心理语言学研究对认知科学、对心理学和语言学的理论建设具有重要作用。在语言教学、言语缺陷的诊断...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>蛤蟆先生去看心理医生</td>\n",
       "      <td>[英]罗伯特·戴博德</td>\n",
       "      <td>天津人民出版社</td>\n",
       "      <td>NaN</td>\n",
       "      <td>心理咨询, 情绪, 心理医生, 治愈</td>\n",
       "      <td>http://www.ixinqing.com/book/7662</td>\n",
       "      <td>1167</td>\n",
       "      <td>0</td>\n",
       "      <td>蛤蟆先生一向爱笑爱闹，如今却一反常态地郁郁寡欢，他一个人躲在屋里，连起床梳洗的力气都没有。朋...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 6
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-19T12:43:29.704356Z",
     "start_time": "2025-06-19T12:43:29.693255Z"
    }
   },
   "cell_type": "code",
   "source": "df.shape",
   "id": "d05fe61fad958d2f",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3048, 10)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 7
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-19T12:43:29.848761Z",
     "start_time": "2025-06-19T12:43:29.840739Z"
    }
   },
   "cell_type": "code",
   "source": "df.columns = ['id','标题','作者','出版社','价格','标签','URL','收藏数','推荐数','简介']",
   "id": "aba5e0b5d2539fec",
   "outputs": [],
   "execution_count": 8
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-19T12:43:29.885437Z",
     "start_time": "2025-06-19T12:43:29.856786Z"
    }
   },
   "cell_type": "code",
   "source": "df.describe()",
   "id": "de177d7bcf55a151",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "                id           价格          收藏数          推荐数\n",
       "count  3048.000000  1992.000000  3048.000000  3048.000000\n",
       "mean   1524.500000    50.483951    14.549541     0.102362\n",
       "std     880.026136   159.915577   110.090273     1.275873\n",
       "min       1.000000     0.000000     0.000000     0.000000\n",
       "25%     762.750000    15.300000     0.000000     0.000000\n",
       "50%    1524.500000    22.000000     1.000000     0.000000\n",
       "75%    2286.250000    35.050000     4.000000     0.000000\n",
       "max    3048.000000  2921.000000  4162.000000    62.000000"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>价格</th>\n",
       "      <th>收藏数</th>\n",
       "      <th>推荐数</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>3048.000000</td>\n",
       "      <td>1992.000000</td>\n",
       "      <td>3048.000000</td>\n",
       "      <td>3048.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>1524.500000</td>\n",
       "      <td>50.483951</td>\n",
       "      <td>14.549541</td>\n",
       "      <td>0.102362</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>880.026136</td>\n",
       "      <td>159.915577</td>\n",
       "      <td>110.090273</td>\n",
       "      <td>1.275873</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>762.750000</td>\n",
       "      <td>15.300000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>1524.500000</td>\n",
       "      <td>22.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>2286.250000</td>\n",
       "      <td>35.050000</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>3048.000000</td>\n",
       "      <td>2921.000000</td>\n",
       "      <td>4162.000000</td>\n",
       "      <td>62.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 9
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-19T12:43:29.975202Z",
     "start_time": "2025-06-19T12:43:29.935970Z"
    }
   },
   "cell_type": "code",
   "source": [
    "missing_info = df.isnull().sum()\n",
    "duplicated_values = df.duplicated().sum()\n",
    "print(\"各列缺失值统计：\\n\", missing_info)\n",
    "print(\"\\n数据重复值统计：\\n\", duplicated_values)"
   ],
   "id": "e586f762f931e413",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "各列缺失值统计：\n",
      " id        0\n",
      "标题        0\n",
      "作者        0\n",
      "出版社       0\n",
      "价格     1056\n",
      "标签        0\n",
      "URL       0\n",
      "收藏数       0\n",
      "推荐数       0\n",
      "简介        0\n",
      "dtype: int64\n",
      "\n",
      "数据重复值统计：\n",
      " 0\n"
     ]
    }
   ],
   "execution_count": 10
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-19T12:43:30.120848Z",
     "start_time": "2025-06-19T12:43:30.088229Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 遍历 DataFrame 中字符串类型的列\n",
    "for column in df.select_dtypes(include=['object']).columns:\n",
    "    # 统计该列中空字符串的数量\n",
    "    empty_count = df[df[column].str.strip() == ''][column].count()\n",
    "    print(f\"属性 '{column}' 中的空字符串数量: {empty_count}\")"
   ],
   "id": "ff795102064bb0f5",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "属性 '标题' 中的空字符串数量: 0\n",
      "属性 '作者' 中的空字符串数量: 51\n",
      "属性 '出版社' 中的空字符串数量: 85\n",
      "属性 '标签' 中的空字符串数量: 0\n",
      "属性 'URL' 中的空字符串数量: 0\n",
      "属性 '简介' 中的空字符串数量: 65\n"
     ]
    }
   ],
   "execution_count": 11
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-19T12:43:30.231235Z",
     "start_time": "2025-06-19T12:43:30.207522Z"
    }
   },
   "cell_type": "code",
   "source": [
    "df.loc[df['作者'].str.strip() == '', '作者'] = '未知'\n",
    "df.loc[df['简介'].str.strip() == '', '简介'] = '暂无简介'\n",
    "df.loc[df['出版社'].str.strip() == '', '出版社'] = '未知'"
   ],
   "id": "3df0d4f31797eeb9",
   "outputs": [],
   "execution_count": 12
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-19T12:43:30.289689Z",
     "start_time": "2025-06-19T12:43:30.270769Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 计算均值、最大值、最小值\n",
    "stats = df[['收藏数', '推荐数', '价格']].agg(['mean', 'max', 'min','count'])\n",
    "print(stats)"
   ],
   "id": "f324824c9e29a711",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "               收藏数          推荐数           价格\n",
      "mean     14.549541     0.102362    50.483951\n",
      "max    4162.000000    62.000000  2921.000000\n",
      "min       0.000000     0.000000     0.000000\n",
      "count  3048.000000  3048.000000  1992.000000\n"
     ]
    }
   ],
   "execution_count": 13
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-19T12:43:30.382627Z",
     "start_time": "2025-06-19T12:43:30.349444Z"
    }
   },
   "cell_type": "code",
   "source": [
    "split_tags = df['标签'].fillna('').str.split(',').apply(lambda x: [tag.strip() for tag in x])\n",
    "tag_counts = split_tags.explode().value_counts()\n",
    "tag_counts_df = tag_counts.reset_index()\n",
    "tag_counts_df.columns = ['标签', '出现次数']\n",
    "print(tag_counts_df)"
   ],
   "id": "68e41f057d426367",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "       标签  出现次数\n",
      "0     未分类  2407\n",
      "1    心理咨询    88\n",
      "2      人格    74\n",
      "3    精神分析    55\n",
      "4    心理治疗    53\n",
      "..    ...   ...\n",
      "178    人本     1\n",
      "179    形式     1\n",
      "180   领导力     1\n",
      "181    临床     1\n",
      "182    指南     1\n",
      "\n",
      "[183 rows x 2 columns]\n"
     ]
    }
   ],
   "execution_count": 14
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-19T12:43:30.455163Z",
     "start_time": "2025-06-19T12:43:30.428453Z"
    }
   },
   "cell_type": "code",
   "source": [
    "Class_list = ['作者','出版社']\n",
    "for i in Class_list:\n",
    "    class_counts = df[i].value_counts()\n",
    "    print(f'============={i}的分类统计情况===============')\n",
    "    print(class_counts)"
   ],
   "id": "ba77c9b5576e82e9",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=============作者的分类统计情况===============\n",
      "作者\n",
      "未知                                    98\n",
      "弗洛伊德                                  18\n",
      "[奥]弗洛伊德                               15\n",
      "毕淑敏                                   11\n",
      "岳晓东                                   10\n",
      "                                      ..\n",
      "(美)D·W·卡罗尔                             1\n",
      "Jean0Baudrillard/SheilaFariaGlaser     1\n",
      "孙云晓                                    1\n",
      "[德]阿斯格德姆                               1\n",
      "古斯塔夫·勒庞(GustaveLeBoin)                 1\n",
      "Name: count, Length: 2515, dtype: int64\n",
      "=============出版社的分类统计情况===============\n",
      "出版社\n",
      "中国轻工业出版社                        149\n",
      "未知                              132\n",
      "中国人民大学出版社                       102\n",
      "华东师范大学出版社                        70\n",
      "北京大学出版社                          59\n",
      "                               ... \n",
      "中国人口出版社                           1\n",
      "中国民间文学出版社                         1\n",
      "PenguinBooksLtd;NewEdedition      1\n",
      "百家出版社                             1\n",
      "文化发展出版社(原印刷工业出版社)                 1\n",
      "Name: count, Length: 606, dtype: int64\n"
     ]
    }
   ],
   "execution_count": 15
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-19T12:47:26.410307Z",
     "start_time": "2025-06-19T12:44:02.523636Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import jieba\n",
    "import jieba.analyse\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from collections import defaultdict\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "\n",
    "# ==================== 常量定义 ====================\n",
    "PSYCHOLOGY_TERMS = [\n",
    "    \"心理学\", \"基础心理学\", \"应用心理学\", \"发展心理学\", \"社会心理学\", \n",
    "    \"教育心理学\", \"临床心理学\", \"认知心理学\", \"人格心理学\", \"心理咨询\",\n",
    "    \"心理治疗\", \"认知行为疗法\", \"精神分析疗法\", \"正念疗法\", \"情绪管理\",\n",
    "    \"依恋理论\", \"创伤疗愈\", \"抑郁\", \"焦虑\", \"大五人格\", \"心理测评\", \"MBTI\",\n",
    "    \"心理\", \"疗法\", \"咨询\", \"治疗\", \"情绪\", \"关系\", \"成长\", \"心境\", \"快感\",\n",
    "    \"绝望\", \"自责\", \"自杀\", \"失眠\", \"疲劳\", \"食欲\", \"注意\",\"认知\", \"归因\", \n",
    "    \"图式\", \"自动\", \"反刍\", \"无助\", \"无望\",\"疗法\", \"咨询\", \"治疗\", \"药物\", \n",
    "    \"电疗\", \"光照\", \"激活\", \"正念\",\"共病\", \"复发\", \"缓解\", \"康复\", \"病耻\", \"支持\", \"危机\"\n",
    "]  # 核心术语列表\n",
    "\n",
    "STOPWORDS_PATH = \"../../r/stopwords-master/cn_stopwords.txt\" #jieba分词文件路径\n",
    "TOP_N_TAGS = 3\n",
    "SIMILARITY_THRESHOLD = 0.25  # 阙值相似度\n",
    "\n",
    "# 初始化结巴分词\n",
    "for term in PSYCHOLOGY_TERMS:\n",
    "    jieba.add_word(term)\n",
    "\n",
    "# ==================== 核心函数 ====================\n",
    "def load_enhanced_stopwords(file_path):\n",
    "    #加载并过滤心理学相关停用词\n",
    "    with open(file_path, 'r', encoding='utf-8') as f:\n",
    "        stopwords = {line.strip() for line in f}\n",
    "    return stopwords\n",
    "\n",
    "def preprocess_text(text):\n",
    "    #文本预处理\n",
    "    if pd.isna(text):\n",
    "        return \"\"\n",
    "    valid_chars = [c for c in text if c.isalnum() or c in \"：，。、《》-\"]\n",
    "    return \"\".join(valid_chars).strip()\n",
    "\n",
    "def build_tag_knowledge(df):\n",
    "    #构建标签向量知识库\n",
    "    # 过滤已分类数据并合并内容\n",
    "    classified = df[df['标签'] != '未分类'].copy()\n",
    "    classified['内容'] = classified['标题'] + \" \" + classified['简介']\n",
    "    classified['内容'] = classified['内容'].apply(preprocess_text)\n",
    "    \n",
    "    # 训练全局向量化器\n",
    "    vectorizer = TfidfVectorizer(tokenizer=jieba.lcut_for_search)\n",
    "    all_texts = classified['内容'].tolist()\n",
    "    vectorizer.fit(all_texts)\n",
    "    \n",
    "    # 计算各标签的平均向量\n",
    "    tag_vectors = {}\n",
    "    for tag, group in classified.groupby('标签'):\n",
    "        combined_text = \" \".join(group['内容'])\n",
    "        if combined_text:\n",
    "            tag_vectors[tag] = vectorizer.transform([combined_text])\n",
    "    \n",
    "    return tag_vectors, vectorizer\n",
    "\n",
    "def recommend_tags(content, tag_vectors, vectorizer):\n",
    "    #推荐书籍标签\n",
    "    try:\n",
    "        # 预处理并向量化内容\n",
    "        processed = preprocess_text(content)\n",
    "        if not processed:\n",
    "            return []\n",
    "        \n",
    "        book_vec = vectorizer.transform([processed])\n",
    "        similarities = {}\n",
    "        \n",
    "        # 计算与各标签的相似度\n",
    "        for tag, tag_vec in tag_vectors.items():\n",
    "            sim = cosine_similarity(book_vec, tag_vec)[0][0]\n",
    "            if sim >= SIMILARITY_THRESHOLD:\n",
    "                similarities[tag] = sim\n",
    "        \n",
    "        # 优先取高相似度标签\n",
    "        sorted_tags = sorted(similarities.keys(), key=lambda t: similarities[t], reverse=True)\n",
    "        if len(sorted_tags) >= TOP_N_TAGS:\n",
    "            return sorted_tags[:TOP_N_TAGS]\n",
    "        \n",
    "        # 补充关键词标签\n",
    "        keywords = jieba.analyse.textrank(\n",
    "            processed, topK=TOP_N_TAGS*2, allowPOS=('n', 'vn', 'an')\n",
    "        )\n",
    "        return (sorted_tags + keywords)[:TOP_N_TAGS]\n",
    "    \n",
    "    except Exception as e:\n",
    "        print(f\"标签推荐异常: {str(e)}\")\n",
    "        return []\n",
    "\n",
    "# ==================== 主流程 ====================\n",
    "if __name__ == \"__main__\":\n",
    "    # 初始化配置\n",
    "    stopwords = load_enhanced_stopwords(STOPWORDS_PATH)\n",
    "    jieba.analyse.set_stop_words(STOPWORDS_PATH)  # 使用停用词\n",
    "    \n",
    "    # 构建知识库\n",
    "    print(\"构建标签知识库...\")\n",
    "    tag_vectors, vectorizer = build_tag_knowledge(df)\n",
    "    print(f\"成功构建包含 {len(tag_vectors)} 个标签的知识库\")\n",
    "    \n",
    "    # 处理未分类书籍\n",
    "    uncat_mask = df['标签'] == '未分类'\n",
    "    df['推荐标签'] = df.apply(\n",
    "        lambda row: \", \".join(recommend_tags(\n",
    "            f\"{row['标题']} {row['简介']}\", tag_vectors, vectorizer\n",
    "        )) if row['标签'] == '未分类' else row['标签'], \n",
    "        axis=1\n",
    "    )\n",
    "    \n",
    "    # 输出统计结果\n",
    "    tag_distribution = df['推荐标签'].str.split(',').explode().value_counts()\n",
    "    print(\"\\n标签分布统计:\")\n",
    "    print(tag_distribution.head(10).to_string())\n",
    "    \n",
    "    print(f\"\\n处理完成！总未分类数: {uncat_mask.sum()}, 成功推荐: {(df['推荐标签'] != '未分类').sum()}\")\n"
   ],
   "id": "672c18e382070b2a",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "构建标签知识库...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\python项目\\Spider\\.venv\\Lib\\site-packages\\sklearn\\feature_extraction\\text.py:517: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "成功构建包含 170 个标签的知识库\n",
      "\n",
      "标签分布统计:\n",
      "推荐标签\n",
      " 人格      544\n",
      " 爱情心理    398\n",
      " 精神分析    380\n",
      " 心理治疗    289\n",
      " 心理学     270\n",
      " 案例      264\n",
      "人格       259\n",
      " 自我      249\n",
      " 教科书     244\n",
      "精神分析     243\n",
      "\n",
      "处理完成！总未分类数: 2407, 成功推荐: 3048\n"
     ]
    }
   ],
   "execution_count": 17
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-19T12:47:26.459572Z",
     "start_time": "2025-06-19T12:47:26.435009Z"
    }
   },
   "cell_type": "code",
   "source": "df",
   "id": "c877bd2918237102",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "        id                                                 标题  \\\n",
       "0        1                                              女性潜意识   \n",
       "1        2                                              怪诞行为学   \n",
       "2        3                                            人的潜能和价值   \n",
       "3        4                                              语言心理学   \n",
       "4        5                                         蛤蟆先生去看心理医生   \n",
       "...    ...                                                ...   \n",
       "3043  3044  Simulacra and Simulation (The Body, In Theory:...   \n",
       "3044  3045                           放飞梦想(The Dream Relesers)   \n",
       "3045  3046                 Mistakes Were Made (But Not by Me)   \n",
       "3046  3047                                            艺术与精神分析   \n",
       "3047  3048                                          遇见懂得付出的自己   \n",
       "\n",
       "                                      作者                        出版社      价格  \\\n",
       "0                                    阎勤民                      花城出版社     NaN   \n",
       "1                              [美]丹· 艾瑞里                      中信出版社   30.20   \n",
       "2                               [美]马斯洛等著                      华夏出版社     NaN   \n",
       "3                             (美)D·W·卡罗尔                  华东师范大学出版社   42.30   \n",
       "4                             [英]罗伯特·戴博德                    天津人民出版社     NaN   \n",
       "...                                  ...                        ...     ...   \n",
       "3043  Jean0Baudrillard/SheilaFariaGlaser  UniversityofMichiganPress  241.00   \n",
       "3044                              [美]闻道乐                    现代教育出版社   14.10   \n",
       "3045           CarolTavris/ElliotAronson                   Harcourt  249.00   \n",
       "3046                               彼得·福勒                    四川美术出版社     NaN   \n",
       "3047                              [美]查普曼            中华工商联合出版社有限责任公司    7.99   \n",
       "\n",
       "                           标签                                URL   收藏数  推荐数  \\\n",
       "0     心理咨询, 心理医生, 女性, 潜意识, 意识  http://www.ixinqing.com/book/1596   163    2   \n",
       "1                         未分类    http://www.ixinqing.com/book/28   248    3   \n",
       "2                   人本主义, 马斯洛  http://www.ixinqing.com/book/7466   130    0   \n",
       "3                         教科书   http://www.ixinqing.com/book/165   622    0   \n",
       "4          心理咨询, 情绪, 心理医生, 治愈  http://www.ixinqing.com/book/7662  1167    0   \n",
       "...                       ...                                ...   ...  ...   \n",
       "3043                      未分类  http://www.ixinqing.com/book/5717     0    0   \n",
       "3044                      未分类  http://www.ixinqing.com/book/4028     1    0   \n",
       "3045                      未分类  http://www.ixinqing.com/book/2925     0    0   \n",
       "3046                     精神分析  http://www.ixinqing.com/book/1550     1    0   \n",
       "3047                      未分类   http://www.ixinqing.com/book/102     5    0   \n",
       "\n",
       "                                                     简介  \\\n",
       "0     这是我国第一本以记录和分析心理门诊、心理咨询实例的形式，集中论述女性心理、开解女性性别密码的...   \n",
       "1     《怪诞行为学》中讲述的是我们常常暗下决心节食减肥，但是只要看到甜点小推车一过来，我们的决心就...   \n",
       "2     《人的潜能和价值》是人本主义心理学有关“人的潜能和价值”讨论的论文集。主要选录了马斯洛有关人...   \n",
       "3     心理语言学研究对认知科学、对心理学和语言学的理论建设具有重要作用。在语言教学、言语缺陷的诊断...   \n",
       "4     蛤蟆先生一向爱笑爱闹，如今却一反常态地郁郁寡欢，他一个人躲在屋里，连起床梳洗的力气都没有。朋...   \n",
       "...                                                 ...   \n",
       "3043  Baudillad(8) 哲学(8) 虚拟现实(3) 虚拟(3) postmodenism(...   \n",
       "3044  《放飞梦想》回应了以下问题及困惑：我们的潜能是如何被埋没的；梦想放飞者的3种特质；扼杀梦想的...   \n",
       "3045  Why do people dodgeesponsiility when things fa...   \n",
       "3046  《艺术与精神分析》一书有两个平行的主题。一是：研究审美本质一是：阐述新精神分析作者以后者作为...   \n",
       "3047  十几年前，盖瑞·查普曼博士推出了两性沟通的里程碑式著作《爱的五种语言》，其完美而实用的解决方...   \n",
       "\n",
       "                         推荐标签  \n",
       "0     心理咨询, 心理医生, 女性, 潜意识, 意识  \n",
       "1                爱情心理, 自我, 人格  \n",
       "2                   人本主义, 马斯洛  \n",
       "3                         教科书  \n",
       "4          心理咨询, 情绪, 心理医生, 治愈  \n",
       "...                       ...  \n",
       "3043                 虚拟现实, 哲学  \n",
       "3044              梦想, 特质, 金钥匙  \n",
       "3045                           \n",
       "3046                     精神分析  \n",
       "3047             爱情心理, 个性, 人格  \n",
       "\n",
       "[3048 rows x 11 columns]"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>标题</th>\n",
       "      <th>作者</th>\n",
       "      <th>出版社</th>\n",
       "      <th>价格</th>\n",
       "      <th>标签</th>\n",
       "      <th>URL</th>\n",
       "      <th>收藏数</th>\n",
       "      <th>推荐数</th>\n",
       "      <th>简介</th>\n",
       "      <th>推荐标签</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>女性潜意识</td>\n",
       "      <td>阎勤民</td>\n",
       "      <td>花城出版社</td>\n",
       "      <td>NaN</td>\n",
       "      <td>心理咨询, 心理医生, 女性, 潜意识, 意识</td>\n",
       "      <td>http://www.ixinqing.com/book/1596</td>\n",
       "      <td>163</td>\n",
       "      <td>2</td>\n",
       "      <td>这是我国第一本以记录和分析心理门诊、心理咨询实例的形式，集中论述女性心理、开解女性性别密码的...</td>\n",
       "      <td>心理咨询, 心理医生, 女性, 潜意识, 意识</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>怪诞行为学</td>\n",
       "      <td>[美]丹· 艾瑞里</td>\n",
       "      <td>中信出版社</td>\n",
       "      <td>30.20</td>\n",
       "      <td>未分类</td>\n",
       "      <td>http://www.ixinqing.com/book/28</td>\n",
       "      <td>248</td>\n",
       "      <td>3</td>\n",
       "      <td>《怪诞行为学》中讲述的是我们常常暗下决心节食减肥，但是只要看到甜点小推车一过来，我们的决心就...</td>\n",
       "      <td>爱情心理, 自我, 人格</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>人的潜能和价值</td>\n",
       "      <td>[美]马斯洛等著</td>\n",
       "      <td>华夏出版社</td>\n",
       "      <td>NaN</td>\n",
       "      <td>人本主义, 马斯洛</td>\n",
       "      <td>http://www.ixinqing.com/book/7466</td>\n",
       "      <td>130</td>\n",
       "      <td>0</td>\n",
       "      <td>《人的潜能和价值》是人本主义心理学有关“人的潜能和价值”讨论的论文集。主要选录了马斯洛有关人...</td>\n",
       "      <td>人本主义, 马斯洛</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>语言心理学</td>\n",
       "      <td>(美)D·W·卡罗尔</td>\n",
       "      <td>华东师范大学出版社</td>\n",
       "      <td>42.30</td>\n",
       "      <td>教科书</td>\n",
       "      <td>http://www.ixinqing.com/book/165</td>\n",
       "      <td>622</td>\n",
       "      <td>0</td>\n",
       "      <td>心理语言学研究对认知科学、对心理学和语言学的理论建设具有重要作用。在语言教学、言语缺陷的诊断...</td>\n",
       "      <td>教科书</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>蛤蟆先生去看心理医生</td>\n",
       "      <td>[英]罗伯特·戴博德</td>\n",
       "      <td>天津人民出版社</td>\n",
       "      <td>NaN</td>\n",
       "      <td>心理咨询, 情绪, 心理医生, 治愈</td>\n",
       "      <td>http://www.ixinqing.com/book/7662</td>\n",
       "      <td>1167</td>\n",
       "      <td>0</td>\n",
       "      <td>蛤蟆先生一向爱笑爱闹，如今却一反常态地郁郁寡欢，他一个人躲在屋里，连起床梳洗的力气都没有。朋...</td>\n",
       "      <td>心理咨询, 情绪, 心理医生, 治愈</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3043</th>\n",
       "      <td>3044</td>\n",
       "      <td>Simulacra and Simulation (The Body, In Theory:...</td>\n",
       "      <td>Jean0Baudrillard/SheilaFariaGlaser</td>\n",
       "      <td>UniversityofMichiganPress</td>\n",
       "      <td>241.00</td>\n",
       "      <td>未分类</td>\n",
       "      <td>http://www.ixinqing.com/book/5717</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Baudillad(8) 哲学(8) 虚拟现实(3) 虚拟(3) postmodenism(...</td>\n",
       "      <td>虚拟现实, 哲学</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3044</th>\n",
       "      <td>3045</td>\n",
       "      <td>放飞梦想(The Dream Relesers)</td>\n",
       "      <td>[美]闻道乐</td>\n",
       "      <td>现代教育出版社</td>\n",
       "      <td>14.10</td>\n",
       "      <td>未分类</td>\n",
       "      <td>http://www.ixinqing.com/book/4028</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>《放飞梦想》回应了以下问题及困惑：我们的潜能是如何被埋没的；梦想放飞者的3种特质；扼杀梦想的...</td>\n",
       "      <td>梦想, 特质, 金钥匙</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3045</th>\n",
       "      <td>3046</td>\n",
       "      <td>Mistakes Were Made (But Not by Me)</td>\n",
       "      <td>CarolTavris/ElliotAronson</td>\n",
       "      <td>Harcourt</td>\n",
       "      <td>249.00</td>\n",
       "      <td>未分类</td>\n",
       "      <td>http://www.ixinqing.com/book/2925</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Why do people dodgeesponsiility when things fa...</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3046</th>\n",
       "      <td>3047</td>\n",
       "      <td>艺术与精神分析</td>\n",
       "      <td>彼得·福勒</td>\n",
       "      <td>四川美术出版社</td>\n",
       "      <td>NaN</td>\n",
       "      <td>精神分析</td>\n",
       "      <td>http://www.ixinqing.com/book/1550</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>《艺术与精神分析》一书有两个平行的主题。一是：研究审美本质一是：阐述新精神分析作者以后者作为...</td>\n",
       "      <td>精神分析</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3047</th>\n",
       "      <td>3048</td>\n",
       "      <td>遇见懂得付出的自己</td>\n",
       "      <td>[美]查普曼</td>\n",
       "      <td>中华工商联合出版社有限责任公司</td>\n",
       "      <td>7.99</td>\n",
       "      <td>未分类</td>\n",
       "      <td>http://www.ixinqing.com/book/102</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>十几年前，盖瑞·查普曼博士推出了两性沟通的里程碑式著作《爱的五种语言》，其完美而实用的解决方...</td>\n",
       "      <td>爱情心理, 个性, 人格</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3048 rows × 11 columns</p>\n",
       "</div>"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 18
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-19T12:47:26.615964Z",
     "start_time": "2025-06-19T12:47:26.607793Z"
    }
   },
   "cell_type": "code",
   "source": "df['标签'] = df['标签'].mask(df['标签'] == '未分类', df['推荐标签'])",
   "id": "17a4e0b27361eb57",
   "outputs": [],
   "execution_count": 19
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-19T12:47:26.732712Z",
     "start_time": "2025-06-19T12:47:26.713770Z"
    }
   },
   "cell_type": "code",
   "source": [
    "df1 = df.drop('推荐标签',axis=1)\n",
    "df1"
   ],
   "id": "e8c4454a9cdde022",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "        id                                                 标题  \\\n",
       "0        1                                              女性潜意识   \n",
       "1        2                                              怪诞行为学   \n",
       "2        3                                            人的潜能和价值   \n",
       "3        4                                              语言心理学   \n",
       "4        5                                         蛤蟆先生去看心理医生   \n",
       "...    ...                                                ...   \n",
       "3043  3044  Simulacra and Simulation (The Body, In Theory:...   \n",
       "3044  3045                           放飞梦想(The Dream Relesers)   \n",
       "3045  3046                 Mistakes Were Made (But Not by Me)   \n",
       "3046  3047                                            艺术与精神分析   \n",
       "3047  3048                                          遇见懂得付出的自己   \n",
       "\n",
       "                                      作者                        出版社      价格  \\\n",
       "0                                    阎勤民                      花城出版社     NaN   \n",
       "1                              [美]丹· 艾瑞里                      中信出版社   30.20   \n",
       "2                               [美]马斯洛等著                      华夏出版社     NaN   \n",
       "3                             (美)D·W·卡罗尔                  华东师范大学出版社   42.30   \n",
       "4                             [英]罗伯特·戴博德                    天津人民出版社     NaN   \n",
       "...                                  ...                        ...     ...   \n",
       "3043  Jean0Baudrillard/SheilaFariaGlaser  UniversityofMichiganPress  241.00   \n",
       "3044                              [美]闻道乐                    现代教育出版社   14.10   \n",
       "3045           CarolTavris/ElliotAronson                   Harcourt  249.00   \n",
       "3046                               彼得·福勒                    四川美术出版社     NaN   \n",
       "3047                              [美]查普曼            中华工商联合出版社有限责任公司    7.99   \n",
       "\n",
       "                           标签                                URL   收藏数  推荐数  \\\n",
       "0     心理咨询, 心理医生, 女性, 潜意识, 意识  http://www.ixinqing.com/book/1596   163    2   \n",
       "1                爱情心理, 自我, 人格    http://www.ixinqing.com/book/28   248    3   \n",
       "2                   人本主义, 马斯洛  http://www.ixinqing.com/book/7466   130    0   \n",
       "3                         教科书   http://www.ixinqing.com/book/165   622    0   \n",
       "4          心理咨询, 情绪, 心理医生, 治愈  http://www.ixinqing.com/book/7662  1167    0   \n",
       "...                       ...                                ...   ...  ...   \n",
       "3043                 虚拟现实, 哲学  http://www.ixinqing.com/book/5717     0    0   \n",
       "3044              梦想, 特质, 金钥匙  http://www.ixinqing.com/book/4028     1    0   \n",
       "3045                           http://www.ixinqing.com/book/2925     0    0   \n",
       "3046                     精神分析  http://www.ixinqing.com/book/1550     1    0   \n",
       "3047             爱情心理, 个性, 人格   http://www.ixinqing.com/book/102     5    0   \n",
       "\n",
       "                                                     简介  \n",
       "0     这是我国第一本以记录和分析心理门诊、心理咨询实例的形式，集中论述女性心理、开解女性性别密码的...  \n",
       "1     《怪诞行为学》中讲述的是我们常常暗下决心节食减肥，但是只要看到甜点小推车一过来，我们的决心就...  \n",
       "2     《人的潜能和价值》是人本主义心理学有关“人的潜能和价值”讨论的论文集。主要选录了马斯洛有关人...  \n",
       "3     心理语言学研究对认知科学、对心理学和语言学的理论建设具有重要作用。在语言教学、言语缺陷的诊断...  \n",
       "4     蛤蟆先生一向爱笑爱闹，如今却一反常态地郁郁寡欢，他一个人躲在屋里，连起床梳洗的力气都没有。朋...  \n",
       "...                                                 ...  \n",
       "3043  Baudillad(8) 哲学(8) 虚拟现实(3) 虚拟(3) postmodenism(...  \n",
       "3044  《放飞梦想》回应了以下问题及困惑：我们的潜能是如何被埋没的；梦想放飞者的3种特质；扼杀梦想的...  \n",
       "3045  Why do people dodgeesponsiility when things fa...  \n",
       "3046  《艺术与精神分析》一书有两个平行的主题。一是：研究审美本质一是：阐述新精神分析作者以后者作为...  \n",
       "3047  十几年前，盖瑞·查普曼博士推出了两性沟通的里程碑式著作《爱的五种语言》，其完美而实用的解决方...  \n",
       "\n",
       "[3048 rows x 10 columns]"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>标题</th>\n",
       "      <th>作者</th>\n",
       "      <th>出版社</th>\n",
       "      <th>价格</th>\n",
       "      <th>标签</th>\n",
       "      <th>URL</th>\n",
       "      <th>收藏数</th>\n",
       "      <th>推荐数</th>\n",
       "      <th>简介</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>女性潜意识</td>\n",
       "      <td>阎勤民</td>\n",
       "      <td>花城出版社</td>\n",
       "      <td>NaN</td>\n",
       "      <td>心理咨询, 心理医生, 女性, 潜意识, 意识</td>\n",
       "      <td>http://www.ixinqing.com/book/1596</td>\n",
       "      <td>163</td>\n",
       "      <td>2</td>\n",
       "      <td>这是我国第一本以记录和分析心理门诊、心理咨询实例的形式，集中论述女性心理、开解女性性别密码的...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>怪诞行为学</td>\n",
       "      <td>[美]丹· 艾瑞里</td>\n",
       "      <td>中信出版社</td>\n",
       "      <td>30.20</td>\n",
       "      <td>爱情心理, 自我, 人格</td>\n",
       "      <td>http://www.ixinqing.com/book/28</td>\n",
       "      <td>248</td>\n",
       "      <td>3</td>\n",
       "      <td>《怪诞行为学》中讲述的是我们常常暗下决心节食减肥，但是只要看到甜点小推车一过来，我们的决心就...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>人的潜能和价值</td>\n",
       "      <td>[美]马斯洛等著</td>\n",
       "      <td>华夏出版社</td>\n",
       "      <td>NaN</td>\n",
       "      <td>人本主义, 马斯洛</td>\n",
       "      <td>http://www.ixinqing.com/book/7466</td>\n",
       "      <td>130</td>\n",
       "      <td>0</td>\n",
       "      <td>《人的潜能和价值》是人本主义心理学有关“人的潜能和价值”讨论的论文集。主要选录了马斯洛有关人...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>语言心理学</td>\n",
       "      <td>(美)D·W·卡罗尔</td>\n",
       "      <td>华东师范大学出版社</td>\n",
       "      <td>42.30</td>\n",
       "      <td>教科书</td>\n",
       "      <td>http://www.ixinqing.com/book/165</td>\n",
       "      <td>622</td>\n",
       "      <td>0</td>\n",
       "      <td>心理语言学研究对认知科学、对心理学和语言学的理论建设具有重要作用。在语言教学、言语缺陷的诊断...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>蛤蟆先生去看心理医生</td>\n",
       "      <td>[英]罗伯特·戴博德</td>\n",
       "      <td>天津人民出版社</td>\n",
       "      <td>NaN</td>\n",
       "      <td>心理咨询, 情绪, 心理医生, 治愈</td>\n",
       "      <td>http://www.ixinqing.com/book/7662</td>\n",
       "      <td>1167</td>\n",
       "      <td>0</td>\n",
       "      <td>蛤蟆先生一向爱笑爱闹，如今却一反常态地郁郁寡欢，他一个人躲在屋里，连起床梳洗的力气都没有。朋...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3043</th>\n",
       "      <td>3044</td>\n",
       "      <td>Simulacra and Simulation (The Body, In Theory:...</td>\n",
       "      <td>Jean0Baudrillard/SheilaFariaGlaser</td>\n",
       "      <td>UniversityofMichiganPress</td>\n",
       "      <td>241.00</td>\n",
       "      <td>虚拟现实, 哲学</td>\n",
       "      <td>http://www.ixinqing.com/book/5717</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Baudillad(8) 哲学(8) 虚拟现实(3) 虚拟(3) postmodenism(...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3044</th>\n",
       "      <td>3045</td>\n",
       "      <td>放飞梦想(The Dream Relesers)</td>\n",
       "      <td>[美]闻道乐</td>\n",
       "      <td>现代教育出版社</td>\n",
       "      <td>14.10</td>\n",
       "      <td>梦想, 特质, 金钥匙</td>\n",
       "      <td>http://www.ixinqing.com/book/4028</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>《放飞梦想》回应了以下问题及困惑：我们的潜能是如何被埋没的；梦想放飞者的3种特质；扼杀梦想的...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3045</th>\n",
       "      <td>3046</td>\n",
       "      <td>Mistakes Were Made (But Not by Me)</td>\n",
       "      <td>CarolTavris/ElliotAronson</td>\n",
       "      <td>Harcourt</td>\n",
       "      <td>249.00</td>\n",
       "      <td></td>\n",
       "      <td>http://www.ixinqing.com/book/2925</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Why do people dodgeesponsiility when things fa...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3046</th>\n",
       "      <td>3047</td>\n",
       "      <td>艺术与精神分析</td>\n",
       "      <td>彼得·福勒</td>\n",
       "      <td>四川美术出版社</td>\n",
       "      <td>NaN</td>\n",
       "      <td>精神分析</td>\n",
       "      <td>http://www.ixinqing.com/book/1550</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>《艺术与精神分析》一书有两个平行的主题。一是：研究审美本质一是：阐述新精神分析作者以后者作为...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3047</th>\n",
       "      <td>3048</td>\n",
       "      <td>遇见懂得付出的自己</td>\n",
       "      <td>[美]查普曼</td>\n",
       "      <td>中华工商联合出版社有限责任公司</td>\n",
       "      <td>7.99</td>\n",
       "      <td>爱情心理, 个性, 人格</td>\n",
       "      <td>http://www.ixinqing.com/book/102</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>十几年前，盖瑞·查普曼博士推出了两性沟通的里程碑式著作《爱的五种语言》，其完美而实用的解决方...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3048 rows × 10 columns</p>\n",
       "</div>"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 20
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-19T12:49:18.783700Z",
     "start_time": "2025-06-19T12:49:18.775020Z"
    }
   },
   "cell_type": "code",
   "source": [
    "df1['价格'] = df1['价格'].fillna(-1)  # 用特殊值填充\n",
    "df1['标签'] = df1['标签'].fillna(\"未分类\")"
   ],
   "id": "41c495f2649daf0a",
   "outputs": [],
   "execution_count": 24
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-19T12:50:28.789747Z",
     "start_time": "2025-06-19T12:50:28.733785Z"
    }
   },
   "cell_type": "code",
   "source": "df1.to_csv('data_clear.csv', index=False,encoding='utf-8-sig')",
   "id": "e41cb779b28640d1",
   "outputs": [],
   "execution_count": 27
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "",
   "id": "35a3adaa7a9a3259"
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
